# import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import seaborn; seaborn.set()  # NOTE(review): duplicate — seaborn is already imported as sns on the line above
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.express as px
sns.set_theme()  # NOTE(review): supersedes the earlier seaborn.set() call
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.offline as pyo
import plotly.graph_objs as go  # NOTE(review): duplicate — go was already imported above
# Set notebook mode to work in offline
pyo.init_notebook_mode()  # NOTE(review): redundant — init_notebook_mode(connected=True) already ran above
# Load the Iris dataset from the local CSV file into a DataFrame.
df = pd.read_csv('iris.csv')
# shape of the data: (rows, columns) tuple — expected (150, 5)
df.shape
(150, 5)
# Basic info: column names, dtypes, non-null counts, and memory usage.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 150 entries, 0 to 149 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 sepal_length 150 non-null float64 1 sepal_width 150 non-null float64 2 petal_length 150 non-null float64 3 petal_width 150 non-null float64 4 target 150 non-null int64 dtypes: float64(4), int64(1) memory usage: 6.0 KB
# Distribution of every column: reshape the frame to long format so each
# variable gets one box, with the mean marked on top of the median line.
plt.figure(figsize=(15, 6))
long_df = df.melt()
sns.boxplot(data=long_df, x="variable", y="value", showmeans=True)
plt.show()
# Pairwise scatter plots / per-variable histograms to inspect relationships
# between the features (semicolon suppresses the notebook's repr echo).
sns.pairplot(data=df);
# PyCaret anomaly-detection module (wildcard import is the library's own
# documented usage pattern for notebooks).
from pycaret.anomaly import *

# Initialise the PyCaret experiment; session_id fixes the random seed so
# the run is reproducible.
# BUG FIX: the original wrote `setup = setup(...)`, rebinding the name
# `setup` to the returned experiment object and shadowing pycaret's
# setup() function — any later call to setup() would fail. Bind the
# result to a distinct name instead.
anomaly_setup = setup(df, session_id=123)
| Description | Value | |
|---|---|---|
| 0 | session_id | 123 |
| 1 | Original Data | (150, 5) |
| 2 | Missing Values | False |
| 3 | Numeric Features | 4 |
| 4 | Categorical Features | 1 |
| 5 | Ordinal Features | False |
| 6 | High Cardinality Features | False |
| 7 | High Cardinality Method | None |
| 8 | Transformed Data | (150, 7) |
| 9 | CPU Jobs | -1 |
| 10 | Use GPU | False |
| 11 | Log Experiment | False |
| 12 | Experiment Name | anomaly-default-name |
| 13 | USI | 0f14 |
| 14 | Imputation Type | simple |
| 15 | Iterative Imputation Iteration | None |
| 16 | Numeric Imputer | mean |
| 17 | Iterative Imputation Numeric Model | None |
| 18 | Categorical Imputer | mode |
| 19 | Iterative Imputation Categorical Model | None |
| 20 | Unknown Categoricals Handling | least_frequent |
| 21 | Normalize | False |
| 22 | Normalize Method | None |
| 23 | Transformation | False |
| 24 | Transformation Method | None |
| 25 | PCA | False |
| 26 | PCA Method | None |
| 27 | PCA Components | None |
| 28 | Ignore Low Variance | False |
| 29 | Combine Rare Levels | False |
| 30 | Rare Level Threshold | None |
| 31 | Numeric Binning | False |
| 32 | Remove Outliers | False |
| 33 | Outliers Threshold | None |
| 34 | Remove Multicollinearity | False |
| 35 | Multicollinearity Threshold | None |
| 36 | Remove Perfect Collinearity | False |
| 37 | Clustering | False |
| 38 | Clustering Iteration | None |
| 39 | Polynomial Features | False |
| 40 | Polynomial Degree | None |
| 41 | Trignometry Features | False |
| 42 | Polynomial Threshold | None |
| 43 | Group Features | False |
| 44 | Feature Selection | False |
| 45 | Feature Selection Method | classic |
| 46 | Features Selection Threshold | None |
| 47 | Feature Interaction | False |
| 48 | Feature Ratio | False |
| 49 | Interaction Threshold | None |
Specifying a session id makes the run reproducible. PyCaret automatically interprets the type of each variable and lets us confirm its inferences by pressing ENTER to continue.
Observe that our dataset consists of 5 columns and 150 rows. We could perform various imputations — numeric and categorical — or normalize the data, but our dataset does not require such transformations, so let us continue!
Performing all these computations with a few lines of code presents the beauty of the PyCaret library.
# List all anomaly-detection models available in pycaret.anomaly
# (each backed by a pyod implementation).
models()
| Name | Reference | |
|---|---|---|
| ID | ||
| abod | Angle-base Outlier Detection | pyod.models.abod.ABOD |
| cluster | Clustering-Based Local Outlier | pyod.models.cblof.CBLOF |
| cof | Connectivity-Based Local Outlier | pyod.models.cof.COF |
| iforest | Isolation Forest | pyod.models.iforest.IForest |
| histogram | Histogram-based Outlier Detection | pyod.models.hbos.HBOS |
| knn | K-Nearest Neighbors Detector | pyod.models.knn.KNN |
| lof | Local Outlier Factor | pyod.models.lof.LOF |
| svm | One-class SVM detector | pyod.models.ocsvm.OCSVM |
| pca | Principal Component Analysis | pyod.models.pca.PCA |
| mcd | Minimum Covariance Determinant | pyod.models.mcd.MCD |
| sod | Subspace Outlier Detection | pyod.models.sod.SOD |
| sos | Stochastic Outlier Selection | pyod.models.sos.SOS |
# Tune the Isolation Forest detector: a random-forest classifier trained
# against the 'target' column scores each candidate contamination fraction.
iforest_model = tune_model(
    model='iforest',
    supervised_target='target',
    supervised_estimator='rf',
)
| Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
|---|---|---|---|---|---|---|---|
| 0.0 | 0.9467 | 0.0000 | 0.3800 | 0.4000 | 0.3894 | nan | 0.2000 |
| 0.01 | 0.9452 | 0.0000 | 0.3800 | 0.4000 | 0.3894 | nan | 0.2000 |
| 0.02 | 0.9443 | 0.0000 | 0.3800 | 0.4000 | 0.3894 | nan | 0.2000 |
| 0.03 | 0.9433 | 0.0000 | 0.3790 | 0.4000 | 0.3889 | nan | 0.2000 |
| 0.04 | 0.9429 | 0.0000 | 0.3786 | 0.4000 | 0.3886 | nan | 0.2000 |
| 0.05 | 0.9429 | 0.0000 | 0.3786 | 0.4000 | 0.3886 | nan | 0.2000 |
| 0.06 | 0.9357 | 0.0000 | 0.3714 | 0.4000 | 0.3846 | nan | 0.2000 |
| 0.07 | 0.9429 | 0.0000 | 0.3786 | 0.4000 | 0.3886 | nan | 0.2000 |
| 0.08 | 0.9484 | 0.0000 | 0.3786 | 0.4000 | 0.3886 | nan | 0.2000 |
| 0.09 | 0.9478 | 0.0000 | 0.3786 | 0.4000 | 0.3886 | nan | 0.2000 |
| 0.1 | 0.9467 | 0.0000 | 0.3775 | 0.4000 | 0.3880 | nan | 0.2000 |
# Tune the Local Outlier Factor (LOF) detector, again using a
# random-forest classifier as the supervised estimator.
lof_model = tune_model(
    model='lof',
    supervised_target='target',
    supervised_estimator='rf',
)
| Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
|---|---|---|---|---|---|---|---|
| 0.0 | 0.9467 | 0.0000 | 0.3800 | 0.4000 | 0.3894 | nan | 0.2000 |
| 0.01 | 0.9386 | 0.0000 | 0.3733 | 0.3889 | 0.3798 | nan | 0.1873 |
| 0.02 | 0.9381 | 0.0000 | 0.3733 | 0.3889 | 0.3798 | nan | 0.1873 |
| 0.03 | 0.9367 | 0.0000 | 0.3724 | 0.4000 | 0.3852 | nan | 0.2000 |
| 0.04 | 0.9429 | 0.0000 | 0.3786 | 0.4000 | 0.3886 | nan | 0.2000 |
| 0.05 | 0.9429 | 0.0000 | 0.3786 | 0.4000 | 0.3886 | nan | 0.2000 |
| 0.06 | 0.9429 | 0.0000 | 0.3786 | 0.4000 | 0.3886 | nan | 0.2000 |
| 0.07 | 0.9429 | 0.0000 | 0.3786 | 0.4000 | 0.3886 | nan | 0.2000 |
| 0.08 | 0.9412 | 0.0000 | 0.3786 | 0.4000 | 0.3886 | nan | 0.2000 |
| 0.09 | 0.9401 | 0.0000 | 0.3786 | 0.4000 | 0.3886 | nan | 0.2000 |
| 0.1 | 0.9319 | 0.0000 | 0.3703 | 0.4000 | 0.3840 | nan | 0.2000 |
# Tune the clustering-based (CBLOF) detector with a random-forest
# classifier as the supervised estimator.
cluster_model = tune_model(
    model='cluster',
    supervised_target='target',
    supervised_estimator='rf',
)
| Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
|---|---|---|---|---|---|---|---|
| 0.0 | 0.9467 | 0.0000 | 0.3800 | 0.4000 | 0.3894 | nan | 0.2000 |
| 0.01 | 0.9452 | 0.0000 | 0.3800 | 0.3889 | 0.3835 | nan | 0.1873 |
| 0.02 | 0.9448 | 0.0000 | 0.3800 | 0.3875 | 0.3827 | nan | 0.1875 |
| 0.03 | 0.9367 | 0.0000 | 0.3724 | 0.4000 | 0.3852 | nan | 0.2000 |
| 0.04 | 0.9357 | 0.0000 | 0.3714 | 0.4000 | 0.3846 | nan | 0.2000 |
| 0.05 | 0.9500 | 0.0000 | 0.3786 | 0.4000 | 0.3886 | nan | 0.2000 |
| 0.06 | 0.9429 | 0.0000 | 0.3714 | 0.4000 | 0.3846 | nan | 0.2000 |
| 0.07 | 0.9500 | 0.0000 | 0.3786 | 0.4000 | 0.3886 | nan | 0.2000 |
| 0.08 | 0.9484 | 0.0000 | 0.3786 | 0.4000 | 0.3886 | nan | 0.2000 |
| 0.09 | 0.9478 | 0.0000 | 0.3786 | 0.4000 | 0.3886 | nan | 0.2000 |
| 0.1 | 0.9544 | 0.0000 | 0.3775 | 0.4000 | 0.3880 | nan | 0.2000 |
# Label the data with the fitted cluster model: returns a copy of the
# frame with added 'Anomaly' (0 = inlier, 1 = outlier) and
# 'Anomaly_Score' columns.
cluster_results = assign_model(cluster_model)
# Preview the first five labelled rows.
cluster_results.head()
| sepal_length | sepal_width | petal_length | petal_width | target | Anomaly | Anomaly_Score | |
|---|---|---|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | 0 | 0 | 0.268103 |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | 0 | 0 | 0.223945 |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | 0 | 0 | 0.141955 |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | 0 | 0 | 0.141955 |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | 0 | 0 | 0.298188 |
# Relative frequencies of normal (0) vs anomalous (1) rows flagged by the
# cluster model — normalize=True returns proportions rather than counts.
cluster_results.Anomaly.value_counts(normalize=True)
0 0.9 1 0.1 Name: Anomaly, dtype: float64
# Interactive 2-D view of the anomalies.
evaluate_model(cluster_model);
# Scatter plot of the anomalies.
# NOTE(review): this call is identical to the one above — presumably the
# plot type is chosen interactively in the evaluate_model widget; confirm.
evaluate_model(cluster_model);
# Split the anomaly scores into inliers (Anomaly == 0) and outliers
# (Anomaly == 1).
# FIX: the original used chained indexing (df.loc[mask][col]) — a single
# .loc with a column label is the recommended, unambiguous form — and
# misspelled "inlier" in both the variable name and the legend label.
inlier_scores = cluster_results.loc[cluster_results["Anomaly"] == 0, "Anomaly_Score"].to_numpy()
outlier_scores = cluster_results.loc[cluster_results["Anomaly"] == 1, "Anomaly_Score"].to_numpy()

# Overlaid histograms of the two score distributions; outliers should
# concentrate in the upper tail of the score range.
plt.figure(figsize=(10, 4))
plt.hist(inlier_scores, alpha=0.5, label='Inlier')
plt.hist(outlier_scores, alpha=0.5, label='Outlier')
plt.legend(loc='upper left')
plt.show()
# Pairwise relationships coloured by the anomaly flag, so flagged rows
# stand out against inliers in every 2-D projection.
sns.pairplot(data=cluster_results, hue="Anomaly");